# Core numerics / dataframes / plotting.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
plotly.offline.init_notebook_mode()
import plotly.express as px
# Modelling utilities.
# FIX: StandardScaler lives in sklearn.preprocessing; importing it from
# sklearn.discriminant_analysis relied on a private re-export (IDE auto-import).
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                             mean_absolute_error, mean_squared_error, r2_score)
# reading the CSV file; the real column names are on the second row of the file,
# hence header=1 (the first row is a grouping header).
df_parkinson = pd.read_csv("csv/pd_speech_features.csv", header=1)
# first five rows of the data
df_parkinson.head()
| id | gender | PPE | DFA | RPDE | numPulses | numPeriodsPulses | meanPeriodPulses | stdDevPeriodPulses | locPctJitter | ... | tqwt_kurtosisValue_dec_28 | tqwt_kurtosisValue_dec_29 | tqwt_kurtosisValue_dec_30 | tqwt_kurtosisValue_dec_31 | tqwt_kurtosisValue_dec_32 | tqwt_kurtosisValue_dec_33 | tqwt_kurtosisValue_dec_34 | tqwt_kurtosisValue_dec_35 | tqwt_kurtosisValue_dec_36 | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 0.85247 | 0.71826 | 0.57227 | 240 | 239 | 0.008064 | 0.000087 | 0.00218 | ... | 1.5620 | 2.6445 | 3.8686 | 4.2105 | 5.1221 | 4.4625 | 2.6202 | 3.0004 | 18.9405 | 1 |
| 1 | 0 | 1 | 0.76686 | 0.69481 | 0.53966 | 234 | 233 | 0.008258 | 0.000073 | 0.00195 | ... | 1.5589 | 3.6107 | 23.5155 | 14.1962 | 11.0261 | 9.5082 | 6.5245 | 6.3431 | 45.1780 | 1 |
| 2 | 0 | 1 | 0.85083 | 0.67604 | 0.58982 | 232 | 231 | 0.008340 | 0.000060 | 0.00176 | ... | 1.5643 | 2.3308 | 9.4959 | 10.7458 | 11.0177 | 4.8066 | 2.9199 | 3.1495 | 4.7666 | 1 |
| 3 | 1 | 0 | 0.41121 | 0.79672 | 0.59257 | 178 | 177 | 0.010858 | 0.000183 | 0.00419 | ... | 3.7805 | 3.5664 | 5.2558 | 14.0403 | 4.2235 | 4.6857 | 4.8460 | 6.2650 | 4.0603 | 1 |
| 4 | 1 | 0 | 0.32790 | 0.79782 | 0.53028 | 236 | 235 | 0.008162 | 0.002669 | 0.00535 | ... | 6.1727 | 5.8416 | 6.0805 | 5.7621 | 7.7817 | 11.6891 | 8.2103 | 5.0559 | 6.1164 | 1 |
5 rows × 755 columns
# (rows, columns) of the dataset
df_parkinson.shape
(756, 755)
# displaying the summary of the DataFrame: column dtypes, non-null counts, memory usage.
df_parkinson.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 756 entries, 0 to 755 Columns: 755 entries, id to class dtypes: float64(749), int64(6) memory usage: 4.4 MB
# summary statistics (count, mean, std, quartiles, min/max) of every numeric column.
df_parkinson.describe()
| id | gender | PPE | DFA | RPDE | numPulses | numPeriodsPulses | meanPeriodPulses | stdDevPeriodPulses | locPctJitter | ... | tqwt_kurtosisValue_dec_28 | tqwt_kurtosisValue_dec_29 | tqwt_kurtosisValue_dec_30 | tqwt_kurtosisValue_dec_31 | tqwt_kurtosisValue_dec_32 | tqwt_kurtosisValue_dec_33 | tqwt_kurtosisValue_dec_34 | tqwt_kurtosisValue_dec_35 | tqwt_kurtosisValue_dec_36 | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 756.000000 | 756.000000 | 756.000000 | 756.000000 | 756.000000 | 756.000000 | 756.000000 | 756.000000 | 756.000000 | 756.000000 | ... | 756.000000 | 756.000000 | 756.000000 | 756.000000 | 756.000000 | 756.000000 | 756.000000 | 756.000000 | 756.000000 | 756.000000 |
| mean | 125.500000 | 0.515873 | 0.746284 | 0.700414 | 0.489058 | 323.972222 | 322.678571 | 0.006360 | 0.000383 | 0.002324 | ... | 26.237251 | 22.840337 | 18.587888 | 13.872018 | 12.218953 | 12.375335 | 14.799230 | 14.751559 | 31.481110 | 0.746032 |
| std | 72.793721 | 0.500079 | 0.169294 | 0.069718 | 0.137442 | 99.219059 | 99.402499 | 0.001826 | 0.000728 | 0.002628 | ... | 42.220693 | 32.626464 | 25.537464 | 20.046029 | 17.783642 | 16.341665 | 15.722502 | 14.432979 | 34.230991 | 0.435568 |
| min | 0.000000 | 0.000000 | 0.041551 | 0.543500 | 0.154300 | 2.000000 | 1.000000 | 0.002107 | 0.000011 | 0.000210 | ... | 1.509800 | 1.531700 | 1.582900 | 1.747200 | 1.789500 | 1.628700 | 1.861700 | 1.955900 | 2.364000 | 0.000000 |
| 25% | 62.750000 | 0.000000 | 0.762833 | 0.647053 | 0.386537 | 251.000000 | 250.000000 | 0.005003 | 0.000049 | 0.000970 | ... | 2.408675 | 3.452800 | 3.354825 | 3.077450 | 2.937025 | 3.114375 | 3.665925 | 3.741275 | 3.948750 | 0.000000 |
| 50% | 125.500000 | 1.000000 | 0.809655 | 0.700525 | 0.484355 | 317.000000 | 316.000000 | 0.006048 | 0.000077 | 0.001495 | ... | 5.586300 | 7.062750 | 6.077400 | 4.770850 | 4.300450 | 4.741450 | 6.725700 | 7.334250 | 10.637250 | 1.000000 |
| 75% | 188.250000 | 1.000000 | 0.834315 | 0.754985 | 0.586515 | 384.250000 | 383.250000 | 0.007528 | 0.000171 | 0.002520 | ... | 28.958075 | 29.830850 | 21.944050 | 13.188000 | 10.876150 | 12.201325 | 21.922050 | 22.495175 | 61.125325 | 1.000000 |
| max | 251.000000 | 1.000000 | 0.907660 | 0.852640 | 0.871230 | 907.000000 | 905.000000 | 0.012966 | 0.003483 | 0.027750 | ... | 239.788800 | 203.311300 | 121.542900 | 102.207000 | 85.571700 | 73.532200 | 62.007300 | 57.544300 | 156.423700 | 1.000000 |
8 rows × 755 columns
# counting the number of missing values (NaN) in each column of the DataFrame
# (the output below shows the dataset is complete: every column reports 0).
df_parkinson.isnull().sum()
id 0
gender 0
PPE 0
DFA 0
RPDE 0
..
tqwt_kurtosisValue_dec_33 0
tqwt_kurtosisValue_dec_34 0
tqwt_kurtosisValue_dec_35 0
tqwt_kurtosisValue_dec_36 0
class 0
Length: 755, dtype: int64
# calculating the total number of fully duplicated rows in the DataFrame.
df_parkinson.duplicated().sum()
1
# removing the duplicate row found above, in place (keeps the first occurrence).
df_parkinson.drop_duplicates(inplace=True)
# Pie chart of the target variable: the share of each 'class' value, so the
# class imbalance (~75% positive) is visible at a glance.
label_counts = df_parkinson['class'].value_counts()
plt.pie(label_counts.values,
        labels=label_counts.index,
        autopct='%1.2f%%')
plt.title("Distribution of class feature")
plt.show()
# Grouped histogram showing the distribution of the 'class' target broken
# down by 'gender'.
gender_fig = px.histogram(df_parkinson,
                          x="class",
                          color="gender",
                          barmode="group",
                          title="Distribution of class vs gender")
gender_fig.show()
# separating the features into X and the target labels into y
X = df_parkinson.drop('class', axis=1)
y = df_parkinson['class']
# Standardizing the data (zero mean, unit variance per column)
# NOTE(review): the 'id' column is still in X and gets scaled/used as a
# feature — it is a record identifier and should probably be dropped; verify.
# NOTE(review): the scaler is fit on the FULL dataset before the train/test
# split below, which leaks test-set statistics into training — fitting on
# X_train only would be the sound procedure.
scaler = StandardScaler()
scaler.fit(X)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# transforming the feature matrix with the fitted scaler; X becomes a NumPy array.
X = scaler.transform(X)
print(X)
[[-1.72735399 0.96741792 0.62760454 ... -0.77588335 -0.81550881 -0.36727633] [-1.72735399 0.96741792 0.12189187 ... -0.52746783 -0.58381833 0.39940641] [-1.72735399 0.96741792 0.61791678 ... -0.75681459 -0.80517433 -0.7814501 ] ... [ 1.72413026 -1.03367942 0.81320776 ... -0.71750637 -0.79096529 -0.77339099] [ 1.72413026 -1.03367942 0.54106452 ... -0.77207214 -0.82709784 -0.8122343 ] [ 1.72413026 -1.03367942 0.3946849 ... -0.68735394 -0.8417574 -0.82860968]]
# shapes after scaling: X is (n_samples, n_features), y is (n_samples,)
X.shape, y.shape
((755, 754), (755,))
# 80/20 train-test split of the features X and labels y. Stratifying on y
# keeps the same class ratio in both splits, which matters here because the
# classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=36)
# Candidate classifiers keyed by display name. LogisticRegression gets a
# raised max_iter so it converges on the 754 standardized features.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "XGB Classifier": XGBClassifier(),
    "SVC": SVC(),
    "Random Forest Classifier": RandomForestClassifier(),
    # FIX: display-name typo "Classifer" -> "Classifier"
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "KNeighbours Classifier": KNeighborsClassifier()
}
# Benchmark each candidate model with 10-fold cross-validation and report its
# mean accuracy, in order to pick the best model for this dataset.
for clf_name, clf in models.items():
    fold_scores = cross_val_score(clf, X, y, cv=10)
    mean_pct = fold_scores.mean() * 100
    print(f"The Cross-validation average accuracy of {clf_name} model: {mean_pct:.2f}%")
The Cross-validation average accuracy of Logistic Regression model: 78.42% The Cross-validation average accuracy of XGB Classifier model: 84.23% The Cross-validation average accuracy of SVC model: 83.97% The Cross-validation average accuracy of Random Forest Classifier model: 83.71% The Cross-validation average accuracy of Decision Tree Classifer model: 72.29% The Cross-validation average accuracy of KNeighbours Classifier model: 80.52%
From the above cross-validation accuracy scores of the various models we find that the XGB Classifier is the one with the highest accuracy for this dataset, achieving a mean accuracy score of 84.23% under 10-fold cross-validation. Hence, we will select the XGB classifier for our further processing and prediction.
# making predictions using the best model from the benchmark, i.e. the XGB classifier
modelXGB = models['XGB Classifier']
# fit on the training split, then score accuracy on the hold-out test split
modelXGB.fit(X_train, y_train)
y_predict = modelXGB.predict(X_test)
accuracy = accuracy_score(y_test, y_predict)
print(f"The accuracy of XGB Classifier model: {accuracy*100:.2f}%")
The accuracy of XGB Classifier model: 89.40%
# Calculating and plotting the confusion matrix for the hold-out predictions.
conf_matrix = confusion_matrix(y_test, y_predict)
# FIX: print the heading before the matrix (it was originally printed after it).
print("Confusion Matrix:")
print(conf_matrix)
# Annotated heatmap of the same matrix for easier reading.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()
[[ 25 13] [ 3 110]] Confusion Matrix:
Observing our confusion matrix we find that:
The model correctly identified 25 persons as not having Parkinson's disease (True Negatives).
The model incorrectly classified 13 healthy persons as having Parkinson's disease (False Positives).
The model correctly predicted 110 persons that have Parkinson's disease (True Positives).
The model incorrectly predicted 3 persons as not having Parkinson's disease when they do have it (False Negatives).
Overall, the model predicted correctly in approximately 89.40% of the instances.
# Per-class precision / recall / F1 report for the XGB model's test-set predictions.
report = classification_report(y_test, y_predict)
print("Classification Report: \n", report)
Classification Report:
precision recall f1-score support
0 0.89 0.66 0.76 38
1 0.89 0.97 0.93 113
accuracy 0.89 151
macro avg 0.89 0.82 0.84 151
weighted avg 0.89 0.89 0.89 151
# Calculating and printing error metrics for the best model's (XGB) predictions.
# NOTE(review): MSE/MAE/R^2 are regression metrics; on 0/1 class labels they
# are at best supplementary to the accuracy/precision/recall reported above.
mse = mean_squared_error(y_test, y_predict)
mae = mean_absolute_error(y_test, y_predict)
rmse = np.sqrt(mse)  # reuse mse instead of recomputing mean_squared_error
r2 = r2_score(y_test, y_predict)
# FIX: typo in the heading ("Perfomance Mertis").
print("Performance Metrics for XGB Classifier:")
print(f"Mean Squared Error: {mse:.4f}")
print(f"Root Mean Squared Error: {rmse:.4f}")
print(f"Mean Absolute Error: {mae:.4f}")
print(f"R-squared: {r2:.4f}")
Perfomance Mertis for XGB Classifier: Mean Squared Error: 0.1060 Root Mean Squared Error: 0.3255 Mean Absolute Error: 0.1060 R-squared: 0.4374
Observing the metrics we have a low Mean Absolute Error and Root Mean Squared Error, indicating the predicted values are close to the true values. Our R-squared is around 0.4374, which means that around 43.74% of the variability of the target class is explained by the model.
# Predicting whether a single test instance has Parkinson's disease using the
# trained XGBoost model (`modelXGB`).
single_instance = X_test[1].reshape(1, -1)  # predict() expects a 2-D batch
prediction = modelXGB.predict(single_instance)
print(f"Predicted class for the single instance: {prediction[0]}")
# FIX: compare the scalar element, not the returned array — truth-testing a
# NumPy array only works for size-1 arrays and is best made explicit.
print('This person may have Parkinson disease.' if prediction[0] == 1 else 'This person might not have Parkinson disease.')
Predicted class for the single instance: 1 This person may have Parkinson disease.
# Trying an instance that yields the opposite prediction (class 0).
second_instance = X_test[4].reshape(1, -1)  # predict() expects a 2-D batch
prediction2 = modelXGB.predict(second_instance)
print(f"Predicted class for the second instance: {prediction2[0]}")
# FIX: compare the scalar element, not the returned array (see note above on
# NumPy size-1 array truthiness).
print('This person may have Parkinson disease.' if prediction2[0] == 1 else 'This person might not have Parkinson disease.')
Predicted class for the second instance: 0 This person might not have Parkinson disease.